PM 566 Assignment 3

Author

Dana Gonzalez

Load Libraries

library(dplyr)
library(tidyverse)
library(tidytext)
library(ggplot2)
library(textdata)
library(DT)

Load CSV File

# Load the PubMed abstracts dataset (one row per abstract, with a "term"
# column naming the search term that retrieved it).
# NOTE(review): absolute, user-specific path — not portable; consider a
# project-relative path (e.g. "pubmed.csv") so the script runs on other machines.
pubmed <- read.csv("/Users/danagonzalez/Downloads/pubmed.csv")

Text Mining

Part 1

# Tokenize Abstracts and Token Counts
# Split every abstract into single-word tokens, count each word across the
# corpus, and keep the 20 most frequent (slice_max keeps ties at the cutoff).
abstract_tokens <- pubmed |>
  unnest_tokens(word, abstract) |>
  count(word, sort = TRUE) |>
  slice_max(n, n = 20)

# Bar chart of the top words. reorder(word, n) sorts the y-axis by frequency
# so bars read from most to least common — ggplot2 would otherwise order the
# words alphabetically, which hides the "top 20" ranking.
abstract_tokens |>
  ggplot(aes(n, reorder(word, n))) +
  geom_col(fill = "slategray2") +
  labs(title = "Top 20 Most Frequent Words from 'Abstract' Column (Including Stopwords)",
       x = "Count",
       y = "Word") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

# Abstract Token Counts without Stopwords
# Same tokenization as above, but drop English stopwords (tidytext's
# stop_words table) before counting, so content words surface.
abstract_tokens2 <- pubmed |>
  unnest_tokens(word, abstract) |>
  anti_join(stop_words, by = c("word" = "word")) |>
  count(word, sort=TRUE) |>
  slice_max(n, n = 20)

# reorder(word, n) ranks the bars by frequency instead of the ggplot2
# default alphabetical ordering.
abstract_tokens2 |>
  ggplot(aes(n, reorder(word, n))) +
  geom_col(fill = "slategray2") +
  labs(title = "Top 20 Most Frequent Words from 'Abstract' Column (Without Stopwords)",
       x = "Count",
       y = "Word") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

# Tokenize Search Term and Token Counts without Stopwords
# Tokenize the search-term column (multi-word terms like "prostate cancer"
# split into separate tokens), drop stopwords, and keep the 5 most frequent.
term_tokens <- pubmed |>
  unnest_tokens(word, term) |>
  anti_join(stop_words, by = c("word" = "word")) |>
  count(word, sort = TRUE) |>
  slice_max(n, n = 5)

# reorder(word, n) ranks the bars by frequency instead of the ggplot2
# default alphabetical ordering.
term_tokens |>
  ggplot(aes(n, reorder(word, n))) +
  geom_col(fill = "slategray2") +
  labs(title = "Top 5 Most Frequent Words from 'Term' Column (Without Stopwords)",
       x = "Count",
       y = "Word") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

Part 2

# Tokenize Abstract into Bigrams
# Break abstracts into overlapping two-word sequences and keep the 10 most
# frequent (slice_max keeps ties at the cutoff). Stopwords are intentionally
# retained, per the assignment.
abstract_bigrams <- pubmed |>
  unnest_ngrams(ngram, abstract, n = 2) |>
  count(ngram, sort = TRUE) |>
  slice_max(n, n = 10)

# reorder(ngram, n) ranks the bars by frequency instead of the ggplot2
# default alphabetical ordering.
abstract_bigrams |>
  ggplot(aes(n, reorder(ngram, n))) +
  geom_col(fill = "slategray2") +
  labs(title = "Top 10 Most Frequent Bi-Grams from 'Abstract' Column",
       x = "Count",
       y = "Bi-Gram") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

Part 3

# Calculate TF-IDF Values
# Tokenize each abstract into single words (the output column is also named
# "abstract", so each row of the counted table is one word), count word
# occurrences within each search term, then compute tf-idf treating the
# search term as the "document" unit.
# NOTE(review): reusing "abstract" as the token column name is confusing —
# "word" would be clearer — but downstream code and the printed output rely
# on these column names, so they are left unchanged.
pubmed_tfidf <- pubmed |> 
  unnest_tokens(abstract, abstract) |> 
  count(abstract, term) |> 
  bind_tf_idf(abstract, term, n) |> 
  arrange(desc(tf_idf))

# Determine Top 5 Tokens with Highest TF-IDF Values
# pubmed_tfidf is already sorted by descending tf_idf; re-sorting here keeps
# this block self-contained. The original also called ungroup(), but the
# frame carries no grouping at this point, so that no-op was removed.
top_5_tfidf <- pubmed_tfidf |>
  arrange(desc(tf_idf)) |>
  slice_head(n = 5)

top_5_tfidf
      abstract            term    n          tf      idf     tf_idf
1        covid           covid 7275 0.037105042 1.609438 0.05971826
2     prostate prostate cancer 3832 0.031188957 1.609438 0.05019669
3    eclampsia    preeclampsia 2005 0.014278389 1.609438 0.02298018
4 preeclampsia    preeclampsia 1863 0.013267152 1.609438 0.02135266
5   meningitis      meningitis  429 0.009194171 1.609438 0.01479745

There are significant differences in the top five tokens between part one and part three. Part one returns the following words (in order): prostate, preeclampsia, fibrosis/cystic (tie), covid, and cancer. Part three returns the following tokens (again, in order): covid, prostate, eclampsia, preeclampsia, and meningitis. While there are some commonalities between the two methods (e.g., “covid” and “prostate” appear in both, although not in the same positions), part three surfaces “eclampsia” and “preeclampsia” as two separate high-scoring tokens and includes “meningitis”, which does not appear in part one’s top five. This reflects how TF-IDF rewards tokens that are concentrated within a single search term, whereas raw counts reward overall frequency.

Sentiment Analysis

Part 1

# Perform Sentiment Analysis with NRC Lexicon
# Tag every abstract word with its NRC sentiment(s). A single word can map to
# several sentiments in the NRC lexicon, hence the explicit many-to-many
# join relationship.
nrc_lexicon <- get_sentiments("nrc")

pubmed_sent <- pubmed |>
  unnest_tokens(word, abstract) |>
  inner_join(nrc_lexicon, by = join_by(word), relationship = "many-to-many")

# Interactive table of the word-level sentiment assignments, with paging,
# search, and column sorting enabled.
datatable(
  pubmed_sent,
  options = list(
    pageLength = 10,
    lengthMenu = c(5, 10, 20, 50),
    searching = TRUE,
    ordering = TRUE
  ),
  class = "cell-border stripe"
)
# Determine Most Common Sentiment Per Search Term
# summarise() already returns exactly one row per (term, sentiment) pair, so
# the distinct() call in the original was a no-op and has been removed.
# slice_max() keeps the highest-count sentiment within each term (ties would
# all be kept, matching the original slice_max behavior).
common_sent <- pubmed_sent |>
  group_by(term, sentiment) |>
  summarise(count = n(), .groups = 'drop') |>
  group_by(term) |>
  slice_max(count, n = 1) |>
  ungroup()

common_sent
# A tibble: 5 × 3
  term            sentiment count
  <chr>           <chr>     <int>
1 covid           positive   9874
2 cystic fibrosis positive   2747
3 meningitis      negative   2109
4 preeclampsia    positive   8014
5 prostate cancer negative   8918
# Determine Most Common Sentiment Per Search Term After Removing "Positive" and "Negative"
# The original chained arrange(desc(n)) |> slice(1) on grouped data, which
# only works because a global sort happens to keep rows ordered within each
# group. slice_max(..., with_ties = FALSE) states the intent directly: keep
# the single most frequent remaining sentiment for each term.
no_pos_neg <- pubmed_sent |>
  filter(!sentiment %in% c("positive", "negative")) |>
  count(term, sentiment) |>
  group_by(term) |>
  slice_max(n, n = 1, with_ties = FALSE) |>
  ungroup()

no_pos_neg
# A tibble: 5 × 3
  term            sentiment        n
  <chr>           <chr>        <int>
1 covid           fear          7730
2 cystic fibrosis disgust       1714
3 meningitis      fear          1510
4 preeclampsia    anticipation  4780
5 prostate cancer fear          8118

Part 2

# Obtain Average Positivity Scores for Abstracts Using AFINN Lexicon
# Number each abstract, tokenize into words, and keep only words that carry
# an AFINN score (inner join drops everything else).
afinn_lexicon <- get_sentiments("afinn")

pubmed_sent_afinn <- pubmed |>
  mutate(abstract_no = row_number()) |>
  unnest_tokens(word, abstract) |>
  inner_join(afinn_lexicon, by = "word")

# Mean AFINN value per abstract; arrange() reproduces the ascending
# abstract_no ordering that group_by()/summarize() produced.
pos_scores <- pubmed_sent_afinn |>
  summarize(avg_score = mean(value, na.rm = TRUE), .by = abstract_no) |>
  arrange(abstract_no)

pos_scores
# A tibble: 3,163 × 2
   abstract_no avg_score
         <int>     <dbl>
 1           1    -1.13 
 2           2    -2    
 3           3     0.214
 4           4    -0.8  
 5           5    -1.67 
 6           6    -0.143
 7           7     0.167
 8           8     0.909
 9           9    -0.222
10          10    -1.62 
# ℹ 3,153 more rows
# Visualize Score by Term
# Attach each abstract's average AFINN score back onto the original rows.
# The left join keeps abstracts that matched no lexicon words; their
# avg_score is NA, which geom_boxplot() drops (with a warning) when drawing.
scores_by_term <- pubmed |>
  mutate(abstract_no = row_number()) |>
  left_join(pos_scores, by = "abstract_no")

scores_by_term |>
  ggplot(aes(x = term, y = avg_score)) +
  geom_boxplot(fill = "slategray2") +
  labs(
    title = "Average Positivity Scores by Search Term",
    x = "Search Term",
    y = "Average Positivity Score"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

The term with the highest average positivity score is “cystic fibrosis”, with a mean value of approximately 0.5. It is less clear which term has the lowest average positivity score: “covid”, “meningitis”, and “prostate cancer” each have a mean score of approximately -0.1. Of the five unique terms, “cystic fibrosis” is the only term with a positive mean value.